home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
Personal Computer World 2009 February
/
PCWFEB09.iso
/
Software
/
Resources
/
Chat & Communication
/
Digsby build 37
/
digsby_setup.exe
/
lib
/
lxml
/
html
/
clean.pyo
(
.txt
)
< prev
next >
Wrap
Python Compiled Bytecode
|
2008-10-13
|
14KB
|
579 lines
# Source Generated with Decompyle++
# File: in.pyo (Python 2.5)
import re
import copy
try:
from urlparse import urlsplit
except ImportError:
from urllib.parse import urlsplit
from lxml import etree
from lxml.html import defs
from lxml.html import fromstring, tostring, XHTML_NAMESPACE
from lxml.html import _nons, _transform_result
# Python 2.3 compatibility: ``set`` became a builtin in 2.4.
try:
    set
except NameError:
    from sets import Set as set
# Python 2/3 compatibility shims: look up the Python 2 builtin names and
# fall back to the Python 3 equivalents when they are absent.
# NOTE(review): subscripting ``__builtins__`` assumes it is a dict, which
# holds only for non-__main__ modules — confirm against the original source.
try:
    unichr = __builtins__['unichr']
except (NameError, KeyError):
    unichr = chr
try:
    unicode = __builtins__['unicode']
except (NameError, KeyError):
    unicode = str
try:
    bytes = __builtins__['bytes']
except (NameError, KeyError):
    bytes = str
try:
    basestring = __builtins__['basestring']
except (NameError, KeyError):
    basestring = (str, bytes)

# Public API of this module.
__all__ = [
    'clean_html',
    'clean',
    'Cleaner',
    'autolink',
    'autolink_html',
    'word_break',
    'word_break_html']

# CSS ``expression(...)`` values — IE's script-in-style vector.
_css_javascript_re = re.compile('expression\\s*\\(.*?\\)', re.S | re.I)
# CSS ``@import`` rules (may pull in external, unvetted stylesheets).
_css_import_re = re.compile('@\\s*import', re.I)
# URL schemes that execute script or expose browser internals.
_javascript_scheme_re = re.compile('\\s*(?:javascript|jscript|livescript|vbscript|about|mocha):', re.I)
# Bound ``sub`` for removing runs of whitespace.
_substitute_whitespace = re.compile('\\s+').sub
# IE conditional comments, e.g. ``[if IE 6]>``.
_conditional_comment_re = re.compile('\\[if[\\s\\n\\r]+.*?][\\s\\n\\r]*>', re.I | re.S)
# XPath: all elements carrying an inline ``style`` attribute.
_find_styled_elements = etree.XPath('descendant-or-self::*[@style]')
# XPath: anchors with a non-fragment href, plain and XHTML-namespaced.
_find_external_links = etree.XPath("descendant-or-self::a [normalize-space(@href) and substring(normalize-space(@href),1,1) != '#'] |descendant-or-self::x:a[normalize-space(@href) and substring(normalize-space(@href),1,1) != '#']", namespaces = {
    'x': XHTML_NAMESPACE })
class Cleaner(object):
    """
    Cleans HTML of scripting, embedded content and other unwanted markup.

    Instantiate with keyword options matching the class attributes below,
    then call the instance on a parsed lxml document/element to clean it
    in place, or use :meth:`clean_html` to clean a string.
    """
    # Option defaults; any of these may be overridden per instance.
    scripts = True                  # remove <script> elements
    javascript = True               # strip on* attrs, js: links, sneaky CSS
    comments = True                 # remove comments
    style = False                   # remove <style> tags and style= attrs
    links = True                    # remove <link> elements
    meta = True                     # remove <meta> elements
    page_structure = True           # drop <head>/<html>/<title> wrappers
    processing_instructions = True  # remove processing instructions
    embedded = True                 # remove embedded content (object/embed/...)
    frames = True                   # remove frame-related tags
    forms = True                    # remove forms and form controls
    annoying_tags = True            # remove <blink>/<marque>
    remove_tags = None              # extra tags to strip (children kept)
    allow_tags = None               # if set, only these tags survive
    remove_unknown_tags = True      # drop tags not in defs.tags
    safe_attrs_only = True          # keep only defs.safe_attrs attributes
    add_nofollow = False            # add rel="nofollow" to external links
    host_whitelist = ()             # hosts whose embedded content may stay
    whitelist_tags = set(['iframe', 'embed'])  # tags host_whitelist applies to

    def __init__(self, **kw):
        """Override any class-level option; unknown names raise TypeError."""
        for name, value in kw.items():
            if not hasattr(self, name):
                raise TypeError('Unknown parameter: %s=%r' % (name, value))
            setattr(self, name, value)

    # Attribute(s) that make each of these tags load an external resource;
    # used by allow_element() to decide whether a whitelisted host is used.
    _tag_link_attrs = dict(script = 'src', link = 'href', applet = [
        'code',
        'object'], iframe = 'src', embed = 'src', layer = 'src', a = 'href')

    def __call__(self, doc):
        """Clean ``doc`` (an lxml element or ElementTree) in place."""
        if hasattr(doc, 'getroot'):
            # An ElementTree instance: operate on its root element.
            doc = doc.getroot()
        # Normalize tag names: strip the XHTML namespace prefix.
        for el in doc.iter():
            tag = el.tag
            if isinstance(tag, basestring):
                el.tag = _nons(tag)
        # 'image' is a common invalid alias for 'img'.
        for el in doc.iter('image'):
            el.tag = 'img'
        if not self.comments:
            # Even when comments are kept, IE conditional comments can
            # smuggle arbitrary markup past the parser — kill those.
            self.kill_conditional_comments(doc)
        kill_tags = set()
        # BUGFIX (decompile artifact): honour the remove_tags option; the
        # decompiled code discarded self.remove_tags entirely.
        remove_tags = set(self.remove_tags or ())
        if self.scripts:
            kill_tags.add('script')
        if self.safe_attrs_only:
            safe_attrs = set(defs.safe_attrs)
            for el in doc.iter():
                attrib = el.attrib
                # list() so we can delete while iterating (Python 3 safe).
                for aname in list(attrib.keys()):
                    if aname not in safe_attrs:
                        del attrib[aname]
        if self.javascript:
            if not self.safe_attrs_only:
                # safe_attrs handling above already removed event attrs.
                for el in doc.iter():
                    attrib = el.attrib
                    for aname in list(attrib.keys()):
                        if aname.startswith('on'):
                            del attrib[aname]
            doc.rewrite_links(self._remove_javascript_link,
                              resolve_base_href = False)
            if not self.style:
                # Styles are kept, so scrub script-ish CSS from them.
                for el in _find_styled_elements(doc):
                    old = el.get('style')
                    new = _css_javascript_re.sub('', old)
                    # BUGFIX (decompile artifact): chain the substitutions;
                    # the second sub was applied to `old`, discarding the
                    # expression() removal above.
                    new = _css_import_re.sub('', new)
                    if self._has_sneaky_javascript(new):
                        # Something tricky is going on — drop the attribute.
                        del el.attrib['style']
                    elif new != old:
                        el.set('style', new)
                for el in list(doc.iter('style')):
                    if el.get('type', '').lower().strip() == 'text/javascript':
                        el.drop_tree()
                        continue
                    # BUGFIX (decompile artifact): start from the element's
                    # text; the decompiled code always used ''.
                    old = el.text or ''
                    new = _css_javascript_re.sub('', old)
                    new = _css_import_re.sub('', new)
                    if self._has_sneaky_javascript(new):
                        el.text = '/* deleted */'
                    elif new != old:
                        el.text = new
        if self.comments or self.processing_instructions:
            # NOTE(review): upstream lxml also keys comment removal on
            # either flag — preserved as-is.
            kill_tags.add(etree.Comment)
        if self.processing_instructions:
            kill_tags.add(etree.ProcessingInstruction)
        if self.style:
            kill_tags.add('style')
            for el in _find_styled_elements(doc):
                del el.attrib['style']
        if self.links:
            kill_tags.add('link')
        elif self.style or self.javascript:
            # If styles/scripts are being removed, stylesheet links must
            # go too, even though <link> itself is kept.
            for el in list(doc.iter('link')):
                if 'stylesheet' in el.get('rel', '').lower():
                    el.drop_tree()
        if self.meta:
            kill_tags.add('meta')
        if self.page_structure:
            remove_tags.update(('head', 'html', 'title'))
        if self.embedded:
            # <param> outside <applet>/<object> is invalid; drop it.
            for el in list(doc.iter('param')):
                parent = el.getparent()
                while parent is not None and parent.tag not in ('applet', 'object'):
                    parent = parent.getparent()
                if parent is None:
                    el.drop_tree()
            kill_tags.update(('applet',))
            remove_tags.update(('iframe', 'embed', 'layer', 'object', 'param'))
        if self.frames:
            kill_tags.update(defs.frame_tags)
        if self.forms:
            remove_tags.add('form')
            kill_tags.update(('button', 'input', 'select', 'textarea'))
        if self.annoying_tags:
            remove_tags.update(('blink', 'marque'))

        # Collect elements first; mutating the tree while iterating it
        # would skip nodes.
        _remove = []
        _kill = []
        for el in doc.iter():
            if el.tag in kill_tags:
                if self.allow_element(el):
                    continue
                _kill.append(el)
            elif el.tag in remove_tags:
                if self.allow_element(el):
                    continue
                _remove.append(el)

        if _remove and _remove[0] == doc:
            # Never drop the root element itself; neutralize it instead.
            el = _remove.pop(0)
            el.tag = 'div'
            el.attrib.clear()
        elif _kill and _kill[0] == doc:
            el = _kill.pop(0)
            if el.tag != 'html':
                el.tag = 'div'
            el.clear()
        for el in _kill:
            el.drop_tree()   # element and all its children
        for el in _remove:
            el.drop_tag()    # element only; children are kept

        allow_tags = self.allow_tags
        if self.remove_unknown_tags:
            if allow_tags:
                raise ValueError('It does not make sense to pass in both '
                                 'allow_tags and remove_unknown_tags')
            allow_tags = set(defs.tags)
        if allow_tags:
            bad = []
            for el in doc.iter():
                if el.tag not in allow_tags:
                    bad.append(el)
            for el in bad:
                el.drop_tag()
        if self.add_nofollow:
            for el in _find_external_links(doc):
                if not self.allow_follow(el):
                    el.set('rel', 'nofollow')

    def allow_follow(self, anchor):
        """Hook: return True to exempt ``anchor`` from rel="nofollow"."""
        return False

    def allow_element(self, el):
        """Return True if an embedded element points only at whitelisted hosts."""
        if el.tag not in self._tag_link_attrs:
            return False
        attr = self._tag_link_attrs[el.tag]
        if isinstance(attr, (list, tuple)):
            # Every linking attribute must be present and allowed.
            for one_attr in attr:
                url = el.get(one_attr)
                if not url:
                    return False
                if not self.allow_embedded_url(el, url):
                    return False
            return True
        else:
            url = el.get(attr)
            if not url:
                return False
            return self.allow_embedded_url(el, url)

    def allow_embedded_url(self, el, url):
        """Return True if ``url`` is an http(s) URL on a whitelisted host."""
        if self.whitelist_tags is not None and el.tag not in self.whitelist_tags:
            return False
        (scheme, netloc, path, query, fragment) = urlsplit(url)
        # Drop any :port suffix before comparing against the whitelist.
        netloc = netloc.lower().split(':', 1)[0]
        if scheme not in ('http', 'https'):
            return False
        if netloc in self.host_whitelist:
            return True
        return False

    def kill_conditional_comments(self, doc):
        """
        Remove IE conditional comments, which can embed markup the
        parser never sees.
        """
        self._kill_elements(doc, (lambda el: _conditional_comment_re.search(el.text)), etree.Comment)

    def _kill_elements(self, doc, condition, iterate = None):
        """Drop every element of kind ``iterate`` for which ``condition`` holds."""
        # Collect first, then drop — never mutate while iterating.
        bad = [el for el in doc.iter(iterate) if condition(el)]
        for el in bad:
            el.drop_tree()

    def _remove_javascript_link(self, link):
        """Return '' for links with a script-executing scheme, else the link."""
        # Strip whitespace (which browsers ignore) before scheme-sniffing.
        new = _substitute_whitespace('', link)
        if _javascript_scheme_re.search(new):
            return ''
        return link

    # Bound ``sub`` removing CSS comments, used to defeat comment-based
    # obfuscation like "java/* */script:".
    _substitute_comments = re.compile('/\\*.*?\\*/', re.S).sub

    def _has_sneaky_javascript(self, style):
        """
        Heuristically detect javascript hidden in a style value via
        comments, backslashes or whitespace tricks.
        """
        style = self._substitute_comments('', style)
        style = style.replace('\\', '')
        style = _substitute_whitespace('', style)
        style = style.lower()
        if 'javascript:' in style:
            return True
        if 'expression(' in style:
            return True
        return False

    def clean_html(self, html):
        """Clean ``html`` (a string or parsed document); return the same type."""
        result_type = type(html)
        if isinstance(html, basestring):
            doc = fromstring(html)
        else:
            doc = copy.deepcopy(html)
        self(doc)
        return _transform_result(result_type, doc)
# A shared Cleaner with default options, plus a module-level alias so
# ``clean_html(html)`` works without explicit instantiation.
clean = Cleaner()
clean_html = clean.clean_html

# Patterns recognizing linkable text: plain http(s) URLs and mailto:
# addresses.  Each pattern must define ``body`` and ``host`` groups.
_link_regexes = [
    re.compile('(?P<body>https?://(?P<host>[a-z0-9._-]+)(?:/[/\\-_.,a-z0-9%&?;=~]*)?)', re.I),
    re.compile('mailto:(?P<body>[a-z0-9._-]+@(?P<host>[a-z0-9_._]+[a-z]))', re.I)]
# Elements whose text must never be autolinked.
_avoid_elements = [
    'textarea',
    'pre',
    'code',
    'head',
    'select',
    'a']
# Hosts that should not be turned into links.
_avoid_hosts = [
    re.compile('^localhost', re.I),
    re.compile('\\bexample\\.(?:com|org|net)$', re.I),
    re.compile('^127\\.0\\.0\\.1$')]
# class="nolink" disables autolinking for an element and its subtree.
_avoid_classes = [
    'nolink']
def autolink(el, link_regexes = _link_regexes, avoid_elements = _avoid_elements, avoid_hosts = _avoid_hosts, avoid_classes = _avoid_classes):
    """
    Turn URL-like text inside ``el`` (and its descendants) into <a>
    elements, in place.

    ``link_regexes``: patterns with ``body``/``host`` groups to match.
    ``avoid_elements``: tag names whose subtree is skipped entirely.
    ``avoid_hosts``: host patterns that must not be linked.
    ``avoid_classes``: class names that disable linking for a subtree.
    """
    if el.tag in avoid_elements:
        return
    class_name = el.get('class')
    if class_name:
        class_name = class_name.split()
        for match_class in avoid_classes:
            if match_class in class_name:
                return
    # Iterate a snapshot: linking may insert new children into ``el``.
    for child in list(el):
        autolink(child, link_regexes = link_regexes, avoid_elements = avoid_elements, avoid_hosts = avoid_hosts, avoid_classes = avoid_classes)
        if child.tail:
            (text, tail_children) = _link_text(child.tail, link_regexes, avoid_hosts, factory = el.makeelement)
            if tail_children:
                # Remaining plain text stays as the tail; new anchors are
                # spliced in immediately after ``child``.
                child.tail = text
                index = el.index(child)
                el[index + 1:index + 1] = tail_children
    if el.text:
        (text, pre_children) = _link_text(el.text, link_regexes, avoid_hosts, factory = el.makeelement)
        if pre_children:
            el.text = text
            el[:0] = pre_children
def _link_text(text, link_regexes, avoid_hosts, factory):
leading_text = ''
links = []
last_pos = 0
while None:
(best_match, best_pos) = (None, None)
for regex in link_regexes:
regex_pos = last_pos
while None:
match = regex.search(text, pos = regex_pos)
if match is None:
break
host = match.group('host')
for host_regex in avoid_hosts:
if host_regex.search(host):
regex_pos = match.end()
break
continue
else:
break
continue
if match is None:
continue
if best_pos is None or match.start() < best_pos:
best_match = match
best_pos = match.start()
continue
if best_match is None:
if links:
links[-1].tail = text
else:
leading_text = text
break
link = best_match.group(0)
end = best_match.end()
if link.endswith('.') or link.endswith(','):
end -= 1
link = link[:-1]
prev_text = text[:best_match.start()]
if links:
links[-1].tail = prev_text
else:
leading_text = prev_text
anchor = factory('a')
body = best_match.group('body')
if not body:
body = link
if body.endswith('.') or body.endswith(','):
body = body[:-1]
anchor.text = body
links.append(anchor)
text = text[end:]
continue
return (leading_text, links)
# String/tree convenience wrapper around autolink(); returns the same
# type it was given.
def autolink_html(html, *args, **kw):
    result_type = type(html)
    if not isinstance(html, basestring):
        # Already-parsed document: work on a copy so the caller's tree
        # is left untouched.
        doc = copy.deepcopy(html)
    else:
        doc = fromstring(html)
    autolink(doc, *args, **kw)
    return _transform_result(result_type, doc)

autolink_html.__doc__ = autolink.__doc__
# Elements whose text must never be word-broken.
_avoid_word_break_elements = [
    'pre',
    'textarea',
    'code']
# class="nobreak" disables word breaking for an element and its subtree.
_avoid_word_break_classes = [
    'nobreak']
def word_break(el, max_width = 40, avoid_elements = _avoid_word_break_elements, avoid_classes = _avoid_word_break_classes, break_character = unichr(8203)):
    """
    Insert ``break_character`` (default: zero-width space U+200B) into any
    word longer than ``max_width`` in the text of ``el`` and its
    descendants, in place, so browsers can wrap very long words.

    ``avoid_elements``: tag names whose subtree is skipped.
    ``avoid_classes``: class names that disable breaking for a subtree.
    """
    # BUGFIX: honour the avoid_elements parameter; the original tested the
    # module-level default, silently ignoring the caller's argument.
    if el.tag in avoid_elements:
        return
    class_name = el.get('class')
    if class_name:
        class_name = class_name.split()
        for avoid in avoid_classes:
            if avoid in class_name:
                return
    if el.text:
        el.text = _break_text(el.text, max_width, break_character)
    for child in el:
        word_break(child, max_width = max_width, avoid_elements = avoid_elements, avoid_classes = avoid_classes, break_character = break_character)
        if child.tail:
            child.tail = _break_text(child.tail, max_width, break_character)
def word_break_html(html, *args, **kw):
    """
    String/tree wrapper around word_break(); returns the same type it was
    given.  Consistency fix: like clean_html and autolink_html, an
    already-parsed document is accepted and deep-copied rather than being
    passed to fromstring() (which only handles strings).
    """
    result_type = type(html)
    if isinstance(html, basestring):
        doc = fromstring(html)
    else:
        doc = copy.deepcopy(html)
    word_break(doc, *args, **kw)
    return _transform_result(result_type, doc)
def _break_text(text, max_width, break_character):
words = text.split()
for word in words:
if len(word) > max_width:
replacement = _insert_break(word, max_width, break_character)
text = text.replace(word, replacement)
continue
return text
_break_prefer_re = re.compile('[^a-z]', re.I)
def _insert_break(word, width, break_character):
orig_word = word
result = ''
while len(word) > width:
start = word[:width]
breaks = list(_break_prefer_re.finditer(start))
if breaks:
last_break = breaks[-1]
if last_break.end() > width - 10:
start = word[:last_break.end()]
result += start + break_character
word = word[len(start):]
result += word
return result